Pancreas TRAs
# ----------------------------------------------------------
# TRAs
# ----------------------------------------------------------
# set working directory to folder with TRA tables (rawdata/TRA Daten)
projectPath <- dirname(rstudioapi::getSourceEditorContext()$path)
setwd(paste(projectPath, "rawdata", "TRA Daten", sep = "/"))
# read in all 6 tables
TRAs1_human <-read_csv("Human_protein_atlas_TRA_5median_genes_annotated.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Ensembl_human_genes = col_character(),
## Chromosome = col_character(),
## Startsite = col_double(),
## Strand = col_double(),
## Band = col_character(),
## Symbol = col_character(),
## EntrezGene = col_double(),
## Unigene = col_character(),
## Tissue_number = col_double(),
## Tissues = col_character(),
## Max_tissue = col_character()
## )
TRAs2_human <-read_tsv("tra.2014.human.5x.table.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## ensembl.transcript = col_character(),
## ensembl.gene = col_character(),
## gene.symbol = col_character(),
## entrezID = col_character(),
## refseqID = col_character(),
## unigeneID = col_character(),
## chrom = col_character(),
## startsite = col_double(),
## tiss.number = col_double(),
## tissues = col_character(),
## max.tissue = col_character()
## )
TRAs3_human <-read_tsv("tra.2014.human.roth.5x.table.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## ensembl.transcript = col_character(),
## ensembl.gene = col_character(),
## gene.symbol = col_character(),
## entrezID = col_character(),
## refseqID = col_character(),
## unigeneID = col_character(),
## chrom = col_character(),
## startsite = col_double(),
## tiss.number = col_double(),
## tissues = col_character(),
## max.tissue = col_character()
## )
TRAs4_mouse <-read_tsv("tra.2014.mouse.5x.table.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## ensembl.transcript = col_character(),
## ensembl.gene = col_character(),
## gene.symbol = col_character(),
## entrezID = col_character(),
## refseqID = col_character(),
## unigeneID = col_character(),
## chrom = col_character(),
## startsite = col_double(),
## tiss.number = col_double(),
## tissues = col_character(),
## max.tissue = col_character()
## )
TRAs5_mouse <-read_tsv("tra.2014.mouse.4301.5x.table.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## ensembl.transcript = col_character(),
## ensembl.gene = col_character(),
## gene.symbol = col_character(),
## entrezID = col_character(),
## refseqID = col_character(),
## unigeneID = col_character(),
## chrom = col_character(),
## startsite = col_double(),
## tiss.number = col_double(),
## tissues = col_character(),
## max.tissue = col_character()
## )
TRAs6_human <-read_tsv("tra.2017.human.gtex.5x.table.tsv",col_types = cols(ensembl.chrom=col_character()))
#col_types to correct the parsing failure
PancreasTRAs1_human <- TRAs1_human[grep(x=TRAs1_human$Max_tissue,"panc"),]
PancreasTRAs2_human <- TRAs2_human[grep(x=TRAs2_human$max.tissue,"Panc"),]
PancreasTRAs3_human <- TRAs3_human[grep(x=TRAs3_human$max.tissue,"Panc"),]
PancreasTRAs4_mouse <- TRAs4_mouse[grep(x=TRAs4_mouse$max.tissue,"panc"),]
PancreasTRAs5_mouse <- TRAs5_mouse[grep(x=TRAs5_mouse$max.tissue,"panc"),]
PancreasTRAs6_human <- TRAs6_human[grep(x=TRAs6_human$max.tissue,"Panc"),]
# create character vectors with pancreas specific genes (for both humans & mice)
pancreas_gene_human <- unique(c(PancreasTRAs1_human$Symbol,PancreasTRAs2_human$gene.symbol,PancreasTRAs6_human$ensembl.symbol))
TRA_pancreas_genes_human <- pancreas_gene_human
# all TRAs
TRA_genes_human <- unique(c(TRAs1_human$Symbol,TRAs2_human$gene.symbol,TRAs6_human$ensembl.symbol))
Creating expression matrix
# ----------------------------------------------------------
# Create vsnrma normalized expression matrix
# ----------------------------------------------------------
set.seed(234) # with this vsnrma gives out the same values each time
# get CEL files
setwd(paste(projectPath, "rawdata", "rawdata breast GSE27830", sep = "/"))
data_breast <- ReadAffy()
# change cdf
data_breast@cdfName <- "HGU133Plus2_Hs_ENST"
# create expression matrix
breast_matrix <- exprs(data_breast)
# store colnames (microarray)
breast_microarray_information <- colnames(breast_matrix)
# vsnrma normalization
breast_vsnrma <- vsnrma(data_breast)
## Calculating Expression
# expression matrix of vsnrm normalized data set
breast_vsnrma_matrix <- exprs(breast_vsnrma)
# cut rownames at "."
rownames(breast_vsnrma_matrix) <- str_replace(rownames(breast_vsnrma_matrix),"\\..*","")
# remove ending .CEL from colnames
colnames(breast_vsnrma_matrix) <- str_remove(colnames(breast_vsnrma_matrix),"_.CEL")
# remove control probes (AFFX)
breast_vsnrma_matrix <- breast_vsnrma_matrix[which(!startsWith(rownames(breast_vsnrma_matrix), "AFFX")),]
# transcript IDs
breast_transcript_names <- rownames(breast_vsnrma_matrix)
# ----------------------------------------------------------
# Create data frames which hold information about microarrays
# ----------------------------------------------------------
# create data frame with information about microarray samples
breast_microarrays <- data.frame(number = substr(breast_microarray_information, 1,9),
row.names = c(1:10))
# replace old IDs sample names
colnames(breast_matrix) <- breast_microarrays[,"number"]
colnames(breast_vsnrma_matrix) <- breast_microarrays[,"number"]
# ----------------------------------------------------------
# Convert transcript IDs to gene symbols
# ----------------------------------------------------------
# get table for converting transcript IDs to gene symbols
setwd(paste(projectPath, "rawdata", "tables" ,sep = "/"))
annotation <- read.csv("ensemble.103.txt")
ensemble_transcripts <- annotation[,"Transcript.stable.ID"]
ensemble_symbol <- annotation[,"HGNC.symbol"]
# name each gene symbol by their respective transcripts
names(ensemble_symbol) <- ensemble_transcripts
# select only those transcripts that are also present in the annotation
breast_GeneExprs <- breast_vsnrma_matrix[rownames(breast_vsnrma_matrix) %in% ensemble_transcripts,]
# create vector with the symbols of the transcript from expression matrix
breast_symbol <- ensemble_symbol[rownames(breast_GeneExprs)]
# use gene symbols instead of transcripts as rownames
rownames(breast_GeneExprs) <- as.character(breast_symbol)
# order genes alphabetically & remove all those rows where "" is the rowname
breast_GeneExprs <- breast_GeneExprs[order(rownames(breast_GeneExprs)),][1097:nrow(breast_GeneExprs),]
# replace colnames with the GSM number of the microarrays
colnames(breast_GeneExprs) <- breast_microarrays[,"number"]
# ----------------------------------------------------------
# Breast cancer specific TRAs
# ----------------------------------------------------------
# create a expression matrix with only those genes
breast_GeneExprs_sub <- breast_GeneExprs[which(rownames(breast_GeneExprs) %in% TRA_pancreas_genes_human),]
# ----------------------------------------------------------
# TRAs for all tissues
# ----------------------------------------------------------
breast_GeneExprs_allTRA <- breast_GeneExprs[which(rownames(breast_GeneExprs) %in% TRA_genes_human),]
# ----------------------------------------------------------
# combining transcripts for same gene via median
# ----------------------------------------------------------
combineGeneExprs_median2 <- function(exprsmatrix){
# create data frame with a column of all the gene symbols
data <- data.frame(geneSymbols = rownames(exprsmatrix),
exprsmatrix,
row.names = NULL)
# group data frame by gene symbols and combine values of one gene by median
combined <- aggregate(data[-1], by = list(data$geneSymbols) , FUN = median)
# convert data frame to matrix (only expression values, no row names)
result <- as.matrix(combined[,-1])
# use gene symbols as row names
rownames(result) <- combined[,1]
# output
result
}
breast_GeneExprs_combined <- combineGeneExprs_median2(breast_GeneExprs)
breast_GeneExprs_sub_combined <- combineGeneExprs_median2(breast_GeneExprs_sub)
Quality control
# ----------------------------------------------------------
# Single chip control
# ----------------------------------------------------------
#setwd(paste(projectPath, "plots", sep = "/"))
for (i in 1:10){
image(data_breast[,i], col = rainbow (100, start = 0, end = 0.75)[100:1])
file.name = paste(as.character(breast_microarrays[, "number"])[i],".pdf", sep = "")
#dev.copy2pdf(file = file.name)
}










## The chips do not appear to be faulty.
# ----------------------------------------------------------
# Pheno Data
# ----------------------------------------------------------
pData(data_breast)
## sample
## GSM687017.CEL 1
## GSM687018.CEL 2
## GSM687019.CEL 3
## GSM687020.CEL 4
## GSM687021.CEL 5
## GSM687022.CEL 6
## GSM687023.CEL 7
## GSM687024.CEL 8
## GSM687025.CEL 9
## GSM687026.CEL 10
# ----------------------------------------------------------
# MeanSdPlot
# ----------------------------------------------------------
meanSdPlot(breast_vsnrma, plot = FALSE)$gg + theme(aspect.ratio = 1)

#setwd(paste(projectPath, "plots", sep = "/"))
#dev.copy2pdf(file = "breast_meanSdPlot_vsnrma_normalized.pdf")
## Red line is mostly horizontal. Therefore, mean - standard deviation have no dependence. However, there is a slight upwards trend of the standard deviation.
# ----------------------------------------------------------
# RNA degradation plot: breast data set
# ----------------------------------------------------------
RNAdeg_breast <- AffyRNAdeg(data_breast)
# Shift and scale
par(pty = "s", mai = c(1.1,0.5,0.5,0.1))
plotAffyRNAdeg(RNAdeg_breast, col = rainbow(10))
title(sub = "Breast Cancer Rawdata")

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_rnadeg_rawdata.pdf")
# Shift
par(pty = "s", mai = c(1.1,0.5,0.5,0.1))
plotAffyRNAdeg(RNAdeg_breast, col = rainbow(10), transform = "shift.only")
title(sub = "Breast Cancer Scaled")

#dev.copy2pdf(file = "breast_rnadeg_shift_rawdata.pdf")
## No crossing of lines was detected. The quality of the chips is good.
# ----------------------------------------------------------
# Histogram before and after normalization
# ----------------------------------------------------------
## Before normalization
par(mai = c(0.9,0.9,0.9,0.5))
hist(data_breast,
col = rainbow(10),
main = "Density function of log intensity (data set: GSE27830, Foekens et al.)\nbefore normalization")

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_hist_rawdata.pdf")
## After normalization
par(mai = c(0.9,0.9,0.9,0.5))
plot(density(breast_vsnrma_matrix),
type = "n", xlab = "log Intensity", ylim = c(0, 0.6),
main = "Density function of log intensity (data set: GSE27830, Foekens et al.)\nafter normalization")
for (i in 1:ncol(breast_vsnrma_matrix)){
lines(density(breast_vsnrma_matrix[,i]), col = rainbow(10)[i])
}

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_hist_vsnrma_normalized.pdf")
# ----------------------------------------------------------
# Boxplot before and after normalization
# ----------------------------------------------------------
## Before normalization
par(las = 2, mai = c(1, 0.8, 0.7, 0.1))
boxplot(data_breast, names = breast_microarrays[, "number"],
col = rainbow(10), ylab = "Intensity values",
main = "Gene expression in breast cancer before vsnrma normalization\ndata set: GSE27830 (Foekens et al.)",
cex.axis=0.8)

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_boxplot_rawdata.pdf")
## After normalization
par(las = 2, mai = c(1, 0.8, 0.7, 0.1))
boxplot(breast_vsnrma_matrix, names = breast_microarrays[, "number"],
col = rainbow(10), ylab = "Intensity values",
main = "Gene expression in breast cancer after vsnrma normalization\ndata set: GSE27830 (Foekens et al.)",
cex.axis=0.8)

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_boxplot_vsnrma_normalized.pdf")
# ----------------------------------------------------------
# Scatterplots
# ----------------------------------------------------------
#setwd(paste(projectPath, "plots" ,sep = "/"))
par(mai = c(0.9, 0.9, 0.7, 0.3))
for (i in 1:9) {
for (j in (i+1):10){
plot(breast_vsnrma_matrix[,i], breast_vsnrma_matrix[,j], pch = ".",
xlab = breast_microarrays[, "number"][i],
ylab = breast_microarrays[, "number"][j])
abline(0, 1, col = "red")
title(main = paste("Scatterplot of probe",
breast_microarrays[, "number"] [i],
"and",
breast_microarrays[, "number"] [i+1],
sep = " ", collapse = NULL))
file.name = paste("Scatterplot_",
as.character(breast_microarrays[, "number"] [i],
"_", as.character(breast_microarrays[, "number"][i+1]), ".pdf",
sep =" "))
#dev.copy2pdf(file = file.name)
}
}













































## The scatterplots never show an abnormal distribution.
Descriptive Statistics
# ----------------------------------------------------------
# Boxplots containing all TRA genes present in breast cancer
# ----------------------------------------------------------
# Five boxplots are created, each counting 50 genes, to ensure better visibility and accessibility of each gene.
par(las = 2,
cex.axis = 0.5, cex.main = 0.8,
mai = c(0.65,0.4,0.4,0.1), mfrow = c(2,1))
for (i in seq(1,250,by = 50)){
generange = paste0("(", rownames(breast_GeneExprs_sub_combined)[i],
" - ",
rownames(breast_GeneExprs_sub_combined)[i+49],")")
boxplot(t(breast_GeneExprs_sub_combined[i:(i+49),filter(breast_microarrays)$number]),
col = rainbow(50),
main= paste0("Boxplot of TRA gene expression in breast cancer ", generange, "\ndata set: GSE27830 (Foekens et al.)"),
horizontal = FALSE)
abline(h = median(apply(breast_vsnrma_matrix, 1, median)))
filename = paste0("breast_geneexpression_boxplot_",
rownames(breast_GeneExprs_sub_combined)[i],"_",
rownames(breast_GeneExprs_sub_combined)[i+49],".pdf")
print(filename)
setwd(paste(projectPath, "plots", sep = "/"))
# dev.copy2pdf(file = filename)
}
## [1] "breast_geneexpression_boxplot_AAK1_CPB1.pdf"

## [1] "breast_geneexpression_boxplot_CRACR2B_GRB10.pdf"
## [1] "breast_geneexpression_boxplot_GRK4_PAFAH2.pdf"

## [1] "breast_geneexpression_boxplot_PAK3_SEL1L.pdf"
## [1] "breast_geneexpression_boxplot_SERPINA3_ZNF780A.pdf"

# ----------------------------------------------------------
# Heatmaps
# ----------------------------------------------------------
filenames <- rownames(pData(data_breast))
breast_samples <- substr(filenames, 1, 9)
# Heatmap of the gene expression in breast cancer.
pheatmap(t(breast_GeneExprs_sub),
labels_row = paste(breast_samples),
labels_col = rep("",nrow(breast_GeneExprs_sub)),
main = "Gene expression in breast cancer \ndata set: GSE27830 (Foekens et al.)",
cluster_rows = TRUE,
cluster_cols = TRUE,
fontsize_row = 8, fontsize = 10)

# Gene expression of breast cancer after combining.
pheatmap(t(breast_GeneExprs_sub_combined),
labels_row = paste(breast_samples),
labels_col = rep("",nrow(breast_GeneExprs_sub_combined)),
main = "Gene expression in breast cancer after combination \ndata set: GSE27830 (Foekens et al.)",
cluster_rows = TRUE,
cluster_cols = TRUE,
fontsize_row = 8, fontsize = 10)

Analysis
## Mean of entire set
mean(breast_GeneExprs_sub_combined)
## [1] 7.241704
## 7.241704
## Median of entire set
median(breast_GeneExprs_sub_combined)
## [1] 6.766971
## 6.766971
## Standard deviation of entire set
sd(breast_GeneExprs_sub_combined)
## [1] 1.41852
## 1.41852
# ----------------------------------------------------------
# High variance
# ----------------------------------------------------------
## Calculate variance of rows (genes)
var_breast <- apply(breast_GeneExprs_sub_combined, 1, var, na.rm=TRUE)
## Create function to filter out the top 25% of variance.
top5 <- function(x) {
x >= quantile(x, 0.75)
}
## Apply function to variance set, to determine the top 5% of variance.
top5_breast <- top5(var_breast)
top5_breast_names <- which(top5_breast == "TRUE")
high_var_breast_sub <- breast_GeneExprs_sub_combined[top5_breast_names, , drop = FALSE]
dim(high_var_breast_sub)
## [1] 63 10
## data set complete: 250 genes
## data set high variance: 63 genes
# Boxplot of 25% high variance genes
par(las = 2)
par(mai = c(0.7,0.4,0.5,0.1))
boxplot(t(high_var_breast_sub),
col = rainbow(63),
cex.axis = 0.5,
main = "Top 25% high variance genes \ndata set: GSE27830 (Foekens et al.)");
abline(h = median(breast_GeneExprs_sub_combined), col = "red")

# ----------------------------------------------------------
# Genes with high expression levels
# ----------------------------------------------------------
# Median
breast_median <- apply(breast_GeneExprs_sub_combined, 1, median, na.rm=TRUE)
median(breast_median)
## [1] 6.754704
## Median: 6.754704
## Search for genes, whose median gene expression are more than 1.5 times that of the median.
which(breast_median > (1.5 * median(breast_median)))
## CYB5A GNAS NUPR1 RPL5 RPL8 RPS9 SERPINA3 SPP1
## 58 93 143 192 193 194 201 222
## SSR4 XBP1
## 225 246
## Names of genes that fit this criteria: 10 in total.
## CYB5A GNAS NUPR1 RPL5 RPL8 RPS9 SERPINA3 SPP1 SSR4 XBP1
## Expression values of those genes:
breast_median_Gene_Exprs <- breast_GeneExprs[which(breast_median > (1.5 * median(breast_median)))]
breast_median_Gene_Exprs
## [1] 6.690879 8.422208 7.805209 6.782207 5.740733 5.712064 7.076842 6.705710
## [9] 6.597868 5.970241
# Pancreas TRA with an median expression above the overall median:
medianbreast <- median(breast_GeneExprs_sub_combined)
length(which(apply(breast_GeneExprs_sub_combined,1,median) > medianbreast))
## [1] 124
# 124 genes have a higher expression than the overall median.
# Creating matrix with overexpressed TRAs in decreasing order:
overexpressedbreast <- breast_GeneExprs_sub_combined[which(apply(breast_GeneExprs_sub_combined,1,median)> medianbreast),]
overexpressedbreastordered <- overexpressedbreast[order(apply(overexpressedbreast,1,median),decreasing=TRUE),]
# Boxplot with all the genes that are higher expressed than overall median.
par(las =2, mai = c(0.7,0.6,0.6,0.2))
boxplot(t(overexpressedbreast),
col = rainbow(124),
cex.axis = 0.5,
main="Breast Cancer gene expression of overexpressed pancreas TRAs \ndata set: GSE27830 (Foekens et al.)")
abline(h = median(apply(breast_vsnrma_matrix, 1, median)))

# Boxplot with the top 25 genes that are higher expressed than the median.
par(las =2, mai = c(1.2,0.6,0.6,0.2))
boxplot(t(overexpressedbreastordered[1:25,]),
col = rainbow(25),
cex.axis = 1,
main="Gene expression of the 25 highest expressed pancreas TRAs \ndata set: GSE27830 (Foekens et al.)")
abline(h = median(apply(breast_vsnrma_matrix, 1, median)))
